[Hive] Hive之复杂类型

Hive复杂数据类型(array<T>、MAP、struct)、练习

Posted by 李玉坤 on 2017-10-24

Hive复杂数据类型

array

array_contains(column, ‘value’)

hive_array.txt

1
2
pk	beijing,shanghai,tianjin,hangzhou
jepson changchu,chengdu,wuhan,beijing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
create table hive_array(name string, work_locations array<string>)
row format delimited
fields terminated by '\t' #字段之间的分隔符
COLLECTION ITEMS TERMINATED BY ','; # 数组的分隔符

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_array.txt' into table hive_array;

#查询 work_locations 第一个参数
select name,work_locations[0] from hive_array;

#查询 work_locations 的size
select name,size(work_locations) from hive_array;

#求在 tianjin
select * from hive_array where array_contains(work_locations,'tianjin') ;

MAP

MAP<primitive_type, data_type> kv
hive_map.txt

1
2
3
4
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
create table hive_map(
id int, name string, members map<string,string>, age int
)
row format delimited
fields terminated by ',' #字段之间的分隔符
COLLECTION ITEMS TERMINATED BY '#' #参数之间的分隔符
MAP KEYS TERMINATED BY ':' #key value之间的分隔符
;

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_map.txt' into table hive_map;

#取value
select id,name,members['father'] as father,members['mother'] as mother from hive_map;

#取key
select id,name,map_keys(members) as relations,age from hive_map;

#取key
select id,name,map_values(members) as relations,age from hive_map;

#size
select id,name,size(members) from hive_map;

struct

structs: STRUCT<col_name : data_type [COMMENT col_comment], …>
内部的数据类型可以不同

192.168.1.1#zhangsan:40

hive_struct.txt

1
2
3
4
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70
1
2
3
4
5
6
7
8
9
10
11
12
13
create table hive_struct(
ip string, info struct<name:string,age:int>
)
row format delimited
fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':' #struct的分隔符是':'(这句语句也可以用于array)
;

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_struct.txt' into table hive_struct;

#取出
select ip,info.name,info.age from hive_struct;

练习

click_log.txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
11	ad_101	2014-05-01 06:01:12.334+01
22 ad_102 2014-05-01 07:28:12.342+01
33 ad_103 2014-05-01 07:50:12.33+01
11 ad_104 2014-05-01 09:27:12.33+01
22 ad_103 2014-05-01 09:03:12.324+01
33 ad_102 2014-05-02 19:10:12.343+01
11 ad_101 2014-05-02 09:07:12.344+01
35 ad_105 2014-05-03 11:07:12.339+01
22 ad_104 2014-05-03 12:59:12.743+01
77 ad_103 2014-05-03 18:04:12.355+01
99 ad_102 2014-05-04 00:36:39.713+01
33 ad_101 2014-05-04 19:10:12.343+01
11 ad_101 2014-05-05 09:07:12.344+01
35 ad_102 2014-05-05 11:07:12.339+01
22 ad_103 2014-05-05 12:59:12.743+01
77 ad_104 2014-05-05 18:04:12.355+01
99 ad_105 2014-05-05 20:36:39.713+01

ad_list.txt

1
2
3
4
5
ad_101	http://www.google.com	catalog8|catalog1
ad_102 http://www.sohu.com catalog6|catalog3
ad_103 http://www.baidu.com catalog7
ad_104 http://www.qq.com catalog5|catalog1|catalog4|catalog9
ad_105 http://sina.com
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
create table ad_list(ad_id string,url string,catalogs string)row format delimited fields terminated by '\t';	

人 访问的所有ad_id
去重
select cookie_id, collect_set(ad_id) from click_log group by cookie_id;
不去重
select cookie_id, collect_set(ad_id) from click_log group by cookie_id;


select
click.cookie_id,click.ad_id,click.amount,ad_list.catalogs
from
(select cookie_id, ad_id, count(1) amount from click_log group by cookie_id, ad_id) click
join ad_list
on ad_list.ad_id = click.ad_id;

针对ad_list.txt 中的catalogs列转行
ad_101 catalog8|catalog1
==>
ad_101 catalog8
ad_101 catalog1

select ad_id, catalog from ad_list lateral view outer explode(split(catalogs,'\\|')) t as catalog;

注意:\\ 转义字符
lateral view outer explode(split(catalogs,'\\|')) t as catalog


将catalogs字段转换为数组可以对其内部排序
create table ad_list_2(ad_id string,url string,catalogs array<string>)row format delimited fields terminated by '\t' COLLECTION ITEMS TERMINATED BY '|';
select ad_id,sort_array(catalogs) from ad_list_2;